This is a quick tutorial to get started with scikit-learn.
Parts of the code presented are based on this machineLearning tutorial.
First, let's take a look at the versions of the libraries involved.
In [1]:
# Import each library in turn and print the version this notebook runs against.
import numpy
print('numpy:\t', numpy.__version__, sep='\t')

import scipy
print('scipy:\t', scipy.__version__, sep='\t')

import matplotlib
print('matplotlib:', matplotlib.__version__, sep='\t')

import sklearn
print('scikit-learn:', sklearn.__version__, sep='\t')
Then load some data.
In [2]:
from sklearn import datasets
# Tip: type `datasets.load_` and press <Tab> to see the available completions.
iris = datasets.load_iris()
# The loaded object is dict-like; the last expression displays its keys.
iris.keys()
Out[2]:
In [3]:
# Preview every entry: its key, then the first 390 characters of its string form.
for name, entry in iris.items():
    print('\n== ', name, '==\n', str(entry)[:390])
In [4]:
# Show which Python type is stored under each key.
for name, entry in iris.items():
    print(name, ':', type(entry))
In [5]:
# (key, shape) for every entry that is a NumPy array; isinstance() is the
# idiomatic type check and also accepts ndarray subclasses.
[(key, value.shape) for key, value in iris.items() if isinstance(value, numpy.ndarray)]
Out[5]:
In [6]:
# Note: besides enabling inline plots, %pylab also imports numpy as np and matplotlib.pyplot as plt, and pulls many other names into the namespace.
%pylab inline
Benchmark a classifier, based on ml-benchmarks:
In [7]:
def dtime_to_seconds(dtime):
    """Convert a ``datetime.timedelta`` into a duration in seconds.

    Parameters
    ----------
    dtime : datetime.timedelta
        Elapsed time, e.g. the difference of two ``datetime.now()`` calls.

    Returns
    -------
    float
        Total length of ``dtime`` in seconds, including whole days.
    """
    # ``timedelta.seconds`` is only the 0..86399 remainder next to ``days``;
    # ``total_seconds()`` also accounts for the days (and negative deltas).
    return dtime.total_seconds()
def bench(func, data, n=10):
    """Run ``func(*data)`` ``n`` times and collect the score and run times.

    Parameters
    ----------
    func : callable
        Called as ``func(*data)``; must return ``(score, elapsed)`` where
        ``elapsed`` is something ``dtime_to_seconds`` accepts.
    data : sequence
        Positional arguments forwarded to ``func``.
    n : int, optional
        Number of runs. Must be greater than 2, because the slowest and the
        fastest run are dropped from the result.

    Returns
    -------
    score
        Score of the last successful call, or ``np.inf`` if no call succeeded.
    times : numpy.ndarray
        Run times in seconds with the two extremes removed (``n - 2`` values),
        or an empty array if any call raised.
    """
    assert n > 2
    score = np.inf
    try:
        timings = []
        for _ in range(n):
            score, elapsed = func(*data)
            timings.append(dtime_to_seconds(elapsed))
        # remove extremal values (the slowest run first, then the fastest)
        timings.pop(np.argmax(timings))
        timings.pop(np.argmin(timings))
    except Exception as detail:
        # Best-effort benchmark: report the failure and return no timings.
        # The values must be interpolated with ``%``; handing them to print()
        # as a second argument would print the raw '%s' placeholders.
        print('%s error in function %s'
              % (repr(detail), getattr(func, '__name__', repr(func))))
        timings = []
    return score, np.array(timings)
def bench_skl(X, y, T, valid, n_estimators=1000, n_jobs=5, random_state=None):
    """Fit a random forest on ``(X, y)`` and score it on ``(T, valid)``.

    Parameters
    ----------
    X, y
        Training samples and their labels, passed to ``clf.fit``.
    T, valid
        Held-out samples and the labels the predictions are compared with.
    n_estimators : int, optional
        Number of trees in the forest (default 1000).
    n_jobs : int, optional
        Number of parallel jobs used by the forest (default 5).
    random_state : optional
        Forwarded to the classifier; the default ``None`` leaves the forest
        unseeded. Pass an int for repeatable scores.

    Returns
    -------
    score
        Fraction of predictions on ``T`` that equal ``valid``.
    elapsed : datetime.timedelta
        Wall-clock time spent building, fitting and predicting.
    """
    from sklearn import ensemble

    start = datetime.now()
    # Other estimators (see http://scikit-learn.org/stable/modules/classes.html),
    # e.g. sklearn.linear_model.LogisticRegression(), can be swapped in here.
    clf = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=0,
    )
    clf.fit(X, y)
    # Classification: accuracy on the held-out samples. For a regression
    # estimator, a squared error such as
    # np.linalg.norm(clf.predict(T) - valid, 2) ** 2 would be returned instead.
    score = np.mean(clf.predict(T) == valid)
    return score, datetime.now() - start
from sklearn import datasets
import numpy as np
from datetime import datetime
# Draw one uniform random number per sample to decide which split it joins.
# A fixed seed keeps the train/validation split reproducible across runs.
SEED = 0
iris = datasets.load_iris()
sample_range = np.random.RandomState(SEED).random_sample(size=iris.target.shape[0])

# Samples whose draw is at or above TH train the model; the rest are held out.
# NOTE(review): with TH = 0.7 only about 30% of the samples are used for
# training -- confirm that this proportion is intended.
TH = 0.7
train_mask = sample_range >= TH
# Boolean-mask indexing selects the rows directly and keeps the 2-D shape of
# the data even when a split turns out empty.
X = iris.data[train_mask]
Y = iris.target[train_mask]
T = iris.data[~train_mask]
valid = iris.target[~train_mask]

num_tries = 25
score, times = bench(bench_skl, (X, Y, T, valid), num_tries)
print('Tries:', num_tries, 'Score:', score, 'Time:', np.mean(times), '(mean)', np.median(times), '(median)')